# Load the raw data file.
# data <- read.csv("../data/filledDatabase.csv")[,-c(2:9,11:13)]
data <- read.csv("../data/filledDatabaseNUMONLY_042620.csv")
# Clean the data with the project helpers clean_data() and collapse_data()
# (both are defined outside this section).
data <- clean_data(data) %>% collapse_data()
# Keep the compound identifiers and the group categories in their own vectors
# before the Compound column is dropped from the modeling frame.
compound <- data$Compound
group_cat <- data$GroupCat
# Prepare data for modeling: remove the Compound column; GroupCat stays in
# `data` because it is used as the response further down.
data <- select(data, -c("Compound"))
# data_pca <- get_pc_space(data[,-1], k = 13) %>% scale() %>% data.frame()
# Split the row indices into 5 held-out folds for cross validation later.
# seq_len() replaces 1:nrow(data) so that an empty data frame yields an empty
# index vector instead of c(1, 0).
# NOTE(review): no seed is set before this call, so the fold assignment
# changes from run to run -- confirm whether that is intended.
folds <- caret::createFolds(
  seq_len(nrow(data)),
  k = 5,
  list = TRUE,
  returnTrain = FALSE
)
Multinomial Regression
library(glmnet)
# Predictor matrix for glmnet: every column of `data` except the response.
# The response is dropped by name rather than by position (`data[, -1]`), so
# the matrix stays correct even if GroupCat is not the first column.
X <- data %>%
  select(-GroupCat) %>%
  as.matrix()
# Response (GroupCat) as a one-column matrix.
Y <- data$GroupCat %>% as.matrix()
Coefficient
Ridge
# Ridge penalty (alpha = 0) for the multinomial model; 5-fold cross-validation
# scored on deviance.
ridge_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 0,
  nfolds = 5,
  type.measure = "deviance"
)
# Plot the coefficients returned by get_coef() at lambda.min, keeping the four
# class columns and leaving out the intercept row.
get_coef(ridge_cv, tuning_parameter = ridge_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Hexagonal, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

LASSO
# LASSO penalty (alpha = 1) for the multinomial model; 5-fold cross-validation
# scored on deviance.
lasso_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "multinomial",
  alpha = 1,
  nfolds = 5,
  type.measure = "deviance"
)
# Plot the coefficients returned by get_coef() at lambda.min, keeping the four
# class columns and leaving out the intercept row.
get_coef(lasso_cv, tuning_parameter = lasso_cv$lambda.min) %>%
  select(feature, Cubic, Tilted, Hexagonal, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Elastic Net
library(caret)
# Elastic net: caret tunes the glmnet model (tuneLength = 10) using 5-fold
# cross-validation, with GroupCat as the response and all remaining columns of
# `data` as predictors.
elastic_cv <- train(
  GroupCat ~ .,
  data = data,
  method = "glmnet",
  trControl = trainControl("cv", number = 5),
  tuneLength = 10
)
# Plot the coefficients of the final model at the selected lambda. The
# Hexagonal column is included so the plot covers the same four classes as the
# ridge and LASSO coefficient plots.
elastic_cv$finalModel %>%
  get_coef(tuning_parameter = elastic_cv$bestTune$lambda) %>%
  select(feature, Cubic, Tilted, Hexagonal, Others) %>%
  filter(feature != "(Intercept)") %>%
  plot_coef()

Accurate classification rate
Ridge
# Build the prediction tables for ridge (alpha = 0) at its lambda.min, then
# print the `r` element (presumably the per-fold accuracy rates -- see
# prediction_table(), defined elsewhere).
tb_ridge <- prediction_table(alpha = 0, lambda = ridge_cv$lambda.min)
print_accurate_tb(tb_ridge$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.6904762 | 0.702381 | 0.7471264 | 0.8214286 | 0.7176471 | 0.7358118 |
# Ridge: render the `t` element of the prediction tables as counts.
highlight_tb_count(tb_ridge$t)
| | Cubic | Tilted | Hexagonal | Others |
|---|---|---|---|---|
| Cubic | 71 | 14 | 17 | 7 |
| Tilted | 26 | 186 | 2 | 24 |
| Hexagonal | 6 | 0 | 30 | 3 |
| Others | 4 | 9 | 0 | 25 |
| Total | 107 | 209 | 49 | 59 |
# Ridge: render the `t` element of the prediction tables as proportions.
highlight_tb_percent(tb_ridge$t)
| | Cubic | Tilted | Hexagonal | Others |
|---|---|---|---|---|
| Cubic | 0.66 | 0.07 | 0.35 | 0.12 |
| Tilted | 0.24 | 0.89 | 0.04 | 0.41 |
| Hexagonal | 0.06 | 0 | 0.61 | 0.05 |
| Others | 0.04 | 0.04 | 0 | 0.42 |
| Total | 100% | 100% | 100% | 100% |
LASSO
# Build the prediction tables for LASSO (alpha = 1) at its lambda.min, then
# print the `r` element (presumably the per-fold accuracy rates -- see
# prediction_table(), defined elsewhere).
tb_lasso <- prediction_table(alpha = 1, lambda = lasso_cv$lambda.min)
print_accurate_tb(tb_lasso$r)
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.7142857 | 0.7261905 | 0.7356322 | 0.8214286 | 0.7294118 | 0.7453897 |
# LASSO: render the `t` element of the prediction tables as counts.
highlight_tb_count(tb_lasso$t)
| | Cubic | Tilted | Hexagonal | Others |
|---|---|---|---|---|
| Cubic | 74 | 14 | 14 | 7 |
| Tilted | 23 | 186 | 3 | 23 |
| Hexagonal | 5 | 0 | 31 | 4 |
| Others | 5 | 9 | 1 | 25 |
| Total | 107 | 209 | 49 | 59 |
# LASSO: render the `t` element of the prediction tables as proportions.
highlight_tb_percent(tb_lasso$t)
| | Cubic | Tilted | Hexagonal | Others |
|---|---|---|---|---|
| Cubic | 0.69 | 0.07 | 0.29 | 0.12 |
| Tilted | 0.21 | 0.89 | 0.06 | 0.39 |
| Hexagonal | 0.05 | 0 | 0.63 | 0.07 |
| Others | 0.05 | 0.04 | 0.02 | 0.42 |
| Total | 100% | 100% | 100% | 100% |
Elastic Net
# Build the prediction tables for the elastic net at the alpha / lambda pair
# selected by caret. The tuning values are read by column name instead of by
# position (`[[1]]`, `[[2]]`) so the call does not depend on the column order
# of `bestTune`, and matches the `$lambda` access used for the coefficients.
tb_elastic <- prediction_table(
  alpha = elastic_cv$bestTune$alpha,
  lambda = elastic_cv$bestTune$lambda
)
# Print the `r` element (presumably the per-fold accuracy rates -- see
# prediction_table(), defined elsewhere).
tb_elastic$r %>% print_accurate_tb()
| Fold1 | Fold2 | Fold3 | Fold4 | Fold5 | Mean |
|---|---|---|---|---|---|
| 0.7142857 | 0.7142857 | 0.7471264 | 0.797619 | 0.7411765 | 0.7428987 |
# Elastic net: render the `t` element of the prediction tables as counts.
highlight_tb_count(tb_elastic$t)
| | Cubic | Tilted | Hexagonal | Others |
|---|---|---|---|---|
| Cubic | 73 | 16 | 13 | 11 |
| Tilted | 23 | 180 | 2 | 18 |
| Hexagonal | 6 | 1 | 34 | 2 |
| Others | 5 | 12 | 0 | 28 |
| Total | 107 | 209 | 49 | 59 |
# Elastic net: render the `t` element of the prediction tables as proportions.
highlight_tb_percent(tb_elastic$t)
| | Cubic | Tilted | Hexagonal | Others |
|---|---|---|---|---|
| Cubic | 0.68 | 0.08 | 0.27 | 0.19 |
| Tilted | 0.21 | 0.86 | 0.04 | 0.31 |
| Hexagonal | 0.06 | 0 | 0.69 | 0.03 |
| Others | 0.05 | 0.06 | 0 | 0.47 |
| Total | 100% | 100% | 100% | 100% |